/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.tools; import java.io.*; import java.util.*; import net.nutch.io.*; import net.nutch.db.*; import net.nutch.net.*; import java.util.logging.*; import net.nutch.util.*; import net.nutch.linkdb.*; import net.nutch.pagedb.*; import net.nutch.fetcher.*; /*************************************** * DistributedAnalysisTool performs link-analysis by reading * exclusively from a IWebDBReader, and writing to * an IWebDBWriter. * * This tool can be used in phases via the command line * to compute the LinkAnalysis score across many machines. * * For a single iteration of LinkAnalysis, you must have: * * 1) An "initRound" step that writes down how the work should be * divided. This outputs a "dist" directory which must be made * available to later steps. It requires the input db directory. * * 2) As many simultaneous "computeRound" steps as you like, but this * number must be determined in step 1. Each step may be run * on different machines, or on the same, or however you like. * It requires the the "db" and "dist" directories (or copies) as * inputs. Each run will output an "instructions file". * * 3) A "completeRound" step, which integrates the results of all the * many "computeRound" steps. It writes to a "db" directory. It * assumes that all the instructions files have been gathered into * a single "dist" input directory. If you're running everything * on a single filesystem, this will happen easily. If not, then * you will have to gather the files by hand (or with a script). * * For more iterations, repeat steps 1 - 3! * * @author Mike Cafarella ***************************************/ public class DistributedAnalysisTool { final private static String ASSIGN_FILE_PREFIX = "assignment"; final private static String SCORE_EDITS_FILE_PREFIX = "scoreEdits"; final private static String ASSIGN_COMPLETE = "assignComplete"; final private static float DEFAULT_SCORE = 0.15f; final private static float DECAY_VALUE = 0.85f; public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.DistributedAnalysisTool"); /** * The EditSet inner class represents all of the sorted edits * files we must process. The edit-loop can repeatedly ask * an EditSet for the "next item", and the EditSet will * seamlessly deal with opening and closing files */ class EditSet { File distDir; int numEditFiles; int curEditFile; SequenceFile.Reader curReader; /** * The "distDir" is where we find all the edit files. * The "numEditFiles" is now many we can expect to get there. */ public EditSet(File distDir, int numEditFiles) throws IOException { this.distDir = distDir; this.numEditFiles = numEditFiles; this.curEditFile = 0; getNextReader(); } /** * Get the next item for reading, closing and opening * files if necessary. Return false if there are no * more items to return. */ public synchronized boolean next(Writable key, Writable val) throws IOException { // // Open the next input stream if necessary // if (curReader == null) { getNextReader(); // Assume each edits-file has at least one entry in it. if (curReader == null) { return false; } } return curReader.next(key, val); } /** * Create the next edit reader and return it. */ private void getNextReader() throws IOException { if (curReader != null) { curReader.close(); } if (curEditFile < numEditFiles) { curReader = new SequenceFile.Reader(new File(distDir, SCORE_EDITS_FILE_PREFIX + "." + curEditFile + ".sorted").getPath()); LOG.info("Opened stream to file " + curEditFile); curEditFile++; } } /** */ public synchronized void close() throws IOException { if (curReader != null) { curReader.close(); } curEditFile = numEditFiles; } } /** * This is a Writable version of a Float. We * need this so we can store it in a SequenceFile */ class ScoreValue implements Writable { float score; float nextScore; /** */ public ScoreValue() { } /** */ public void setScore(float f) { this.score = f; } /** */ public void setNextScore(float f) { this.nextScore = f; } /** */ public float score() { return score; } /** */ public float nextScore() { return nextScore; } /** */ public void write(DataOutput out) throws IOException { out.writeFloat(score); out.writeFloat(nextScore); } /** */ public void readFields(DataInput in) throws IOException { this.score = in.readFloat(); this.nextScore = in.readFloat(); } } File dbDir; /** * Give the pagedb and linkdb files and their cache sizes */ public DistributedAnalysisTool(File dbDir) throws IOException, FileNotFoundException { this.dbDir = dbDir; } /** * This method prepares the ground for a set of processes * to distribute a round of LinkAnalysis work. It writes out * the "assignments" to a directory. This directory must be * made accessible to all the processes. (It may be mounted by * all of them, or copied to all of them.) * * This is run by a single process, and it is run first. */ public boolean initRound(int numProcesses, File distDir) throws IOException { // // The distDir must be empty or non-existent. // if ((distDir.exists() && distDir.isFile()) || (distDir.exists() && (distDir.list().length != 0))) { LOG.severe("Must be an empty or non-existent dir: " + distDir); return false; } if (! distDir.exists()) { distDir.mkdir(); } // // Figure out how many db items we have, and how many // processes they are allocated to. // long startPages[] = new long[numProcesses]; long totalPages = 0; IWebDBReader reader = new WebDBReader(dbDir); try { totalPages = reader.numPages(); } finally { reader.close(); } long chunkSize = totalPages / numProcesses; long pagesProcessedSoFar = 0; // // From zero to the 2nd-to-last item, assign a // chunk's worth of pages. The value at each index // indicates the start page for that process. // startPages[0] = 0; for (int i = 1; i < numProcesses; i++) { startPages[i] = startPages[i-1] + chunkSize; } // // Emit the assignments for the processes // try { // Write out each file for (int i = 0; i < numProcesses; i++) { DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new File(distDir, ASSIGN_FILE_PREFIX + "." + i)))); try { // Start page out.writeLong(startPages[i]); // How many pages to process if (i != numProcesses - 1) { out.writeLong(chunkSize); } else { // in last index, make up for remainders out.writeLong(totalPages - ((numProcesses - 1) * chunkSize)); } } finally { out.close(); } } // // Write a file that indicates we finished correctly. // This makes it easier for controlling scripts to // check whether this process completed. // // It also holds some overall instruction information, // so we can do some error-checking at complete-time. // File completeFile = new File(distDir, "assignComplete"); DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(completeFile))); try { out.writeInt(numProcesses); out.writeLong(totalPages); // Compute extents long extent[] = new long[numProcesses]; for (int i = 0; i < numProcesses - 1; i++) { extent[i] = chunkSize * (i + 1); } extent[numProcesses-1] = totalPages - (chunkSize * (numProcesses - 1)); // Emit extents for (int i = 0; i < extent.length; i++) { out.writeLong(extent[i]); } } finally { out.close(); } return true; } catch (IOException ex) { LOG.severe(ex.toString()); LOG.severe("Sorry, could not finish assignments"); } return false; } /** * This method is invoked by one of the many processes involved * in LinkAnalysis. There will be many of these running at the * same time. That's OK, though, since there's no locking * that has to go on between them. * * This computes the LinkAnalysis score for a given region * of the database. It writes its ID, the region params, and * the scores-to-be-written into a flat file. This file is * labelled according to its processid, and is found inside distDir. */ public void computeRound(int processId, File distDir) throws IOException { File assignFile = new File(distDir, ASSIGN_FILE_PREFIX + "." + processId); long startIndex = 0, extent = 0; DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(assignFile))); try { startIndex = in.readLong(); extent = in.readLong(); } finally { in.close(); } LOG.info("Start at: "+ startIndex); LOG.info("Extent: "+ extent); // // Open scoreEdits file for this process. Write down // all the score-edits we want to perform. // File scoreEdits = new File(distDir, SCORE_EDITS_FILE_PREFIX + "." + processId); SequenceFile.Writer scoreWriter = new SequenceFile.Writer(scoreEdits.getPath() + ".unsorted", UTF8.class, ScoreValue.class); // // Go through the appropriate WebDB range, and figure out // next scores // try { // Iterate through all items in the webdb, sorted by URL long curIndex = 0; ScoreValue score = new ScoreValue(); IWebDBReader reader = new WebDBReader(dbDir); try { for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); curIndex++) { // // Find our starting place // if (curIndex < startIndex) { e.nextElement(); continue; } // // Bail if we've been here too long // if (curIndex - startIndex > extent) { break; } // // OK, do some analysis! // Page curPage = (Page) e.nextElement(); Link outLinks[] = reader.getLinks(curPage.getMD5()); int targetOutlinkers = 0; for (int i = 0; i < outLinks.length; i++) { if (outLinks[i].targetHasOutlink()) { targetOutlinkers++; } } // // For our purposes here, assume every Page // has an inlink, even though that might not // really be true. It's close enough. // // // In case there's no previous nextScore, grab // score as an approximation. // float curNextScore = curPage.getNextScore(); if (outLinks.length > 0 && curNextScore == 0.0f) { curNextScore = curPage.getScore(); } // // Compute contributions // float contributionForAll = (outLinks.length > 0) ? (curNextScore / outLinks.length) : 0.0f; float contributionForOutlinkers = (targetOutlinkers > 0) ? (curNextScore / targetOutlinkers) : 0.0f; for (int i = 0; i < outLinks.length; i++) { // emit the target URL and the contribution score.setScore(contributionForAll); score.setNextScore(outLinks[i].targetHasOutlink() ? contributionForOutlinkers : 0.0f); scoreWriter.append(outLinks[i].getURL(), score); } if (((curIndex - startIndex) % 5000) == 0) { LOG.info("Pages consumed: " + (curIndex - startIndex) + " (at index " + curIndex + ")"); } } } finally { reader.close(); } } finally { scoreWriter.close(); } // Now sort the resulting score-edits file SequenceFile.Sorter sorter = new SequenceFile.Sorter(new UTF8.Comparator(), ScoreValue.class); sorter.sort(scoreEdits.getPath() + ".unsorted", scoreEdits.getPath() + ".sorted"); new File(scoreEdits.getPath() + ".unsorted").delete(); } /** * This method collates and executes all the instructions * computed by the many executors of computeRound(). It * figures out what to write by looking at all the flat * files found in the distDir. These files are labelled * according to the processes that filled them. This method * will check to make sure all those files are present * before starting work. * * If the processors are distributed, you might have to * copy all the instruction files to a single distDir before * starting this method. * * Of course, this method is executed on only one process. * It is run last. */ public void completeRound(File distDir, File scoreFile) throws IOException { // // Load the overall assignment file, so we can // see how many processes we have and how many // operations each should include // int numProcesses = 0; long totalPages = 0; long extent[] = null; File overall = new File(distDir, "assignComplete"); DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(overall))); try { numProcesses = in.readInt(); totalPages = in.readLong(); extent = new long[numProcesses]; for (int i = 0; i < numProcesses; i++) { extent[i] = in.readLong(); } } finally { in.close(); in = null; } // // Go through each instructions file we have, and // apply each one to the webdb. // ScoreStats scoreStats = new ScoreStats(); IWebDBReader reader = new WebDBReader(dbDir); IWebDBWriter writer = new WebDBWriter(dbDir); EditSet editSet = new EditSet(distDir, numProcesses); try { int count = 0; UTF8 curEditURL = new UTF8(); ScoreValue curContribution = new ScoreValue(); boolean hasEdit = editSet.next(curEditURL, curContribution); // // Go through all the Pages, in URL-sort order. // We also read from the score-edit file in URL-sort order. // for (Enumeration e = reader.pages(); e.hasMoreElements(); count++) { Page curPage = (Page) e.nextElement(); if (! hasEdit) { break; } // // Apply the current score-edit to the db item, // if appropriate // int comparison = curPage.getURL().compareTo(curEditURL); float newScore = 0.0f, newNextScore = 0.0f; if (comparison < 0) { // Fine. The edit applies to a Page we will // hit later. Ignore it, and move onto the next // Page. This should only happen with Pages // that have no incoming links, which are necessarily // special-case Pages. // // However, that means the Page's score should // be set to the minimum possible, as we have no // incoming links. newScore = (1 - DECAY_VALUE); newNextScore = (1 - DECAY_VALUE); } else if (comparison > 0) { // Error! We should never hit this situation. // It means we have a score-edit for an item // that's not found in the database! throw new IOException("Impossible situation. There is a score-edit for " + curEditURL + ", which comes after the current Page " + curPage.getURL()); } else { // // The only really interesting case is when the // score-edit and the curPage are the same. // // Sum all the contributions while (hasEdit && curPage.getURL().compareTo(curEditURL) == 0) { newScore += curContribution.score(); newNextScore += curContribution.nextScore(); hasEdit = editSet.next(curEditURL, curContribution); } newScore = (1 - DECAY_VALUE) + (DECAY_VALUE * newScore); newNextScore = (1 - DECAY_VALUE) + (DECAY_VALUE * newNextScore); } // Finally, assign it. curPage.setScore(newScore, newNextScore); writer.addPageWithScore(curPage); scoreStats.addScore(newScore); if ((count % 5000) == 0) { LOG.info("Pages written: " + count); } } LOG.info("Pages encountered: " + count); LOG.info("Target pages from init(): " + totalPages); } finally { reader.close(); editSet.close(); writer.close(); } // // Emit the score distribution info // if (scoreFile.exists()) { scoreFile.delete(); } PrintStream pout = new PrintStream(new BufferedOutputStream(new FileOutputStream(scoreFile))); try { scoreStats.emitDistribution(pout); } finally { pout.close(); } // // Delete all the distributed overhead files // FileUtil.fullyDelete(distDir); } /** * Kick off the link analysis. Submit the locations of the * Webdb and the number of iterations. * * DAT -initRound <n_processes> <dist_dir> <db_dir> * DAT -computeRound <process_id> <dist_dir> <db_dir> * DAT -completeRound <dist_dir> <db_dir> */ public static void main(String argv[]) throws IOException { if (argv.length < 2) { System.out.println("usage: java net.nutch.tools.DistributedAnalysisTool -initRound|-computeRound|-completeRound (numProcesses | processId) <dist_dir> <db_dir>"); return; } String command = null; int numProcesses = 0, processId = 0, numIterations = 0; File distDir = null, dbDir = null; for (int i = 0; i < argv.length; i++) { if ("-initRound".equals(argv[i])) { command = argv[i]; numProcesses = Integer.parseInt(argv[i+1]); distDir = new File(argv[i+2]); dbDir = new File(argv[i+3]); i+=3; } else if ("-computeRound".equals(argv[i])) { command = argv[i]; processId = Integer.parseInt(argv[i+1]); distDir = new File(argv[i+2]); dbDir = new File(argv[i+3]); i+=3; } else if ("-completeRound".equals(argv[i])) { command = argv[i]; distDir = new File(argv[i+1]); dbDir = new File(argv[i+2]); i+=2; } } System.out.println("Started at " + new Date(System.currentTimeMillis())); try { DistributedAnalysisTool dat = new DistributedAnalysisTool(dbDir); if ("-initRound".equals(command)) { dat.initRound(numProcesses, distDir); } else if ("-computeRound".equals(command)) { dat.computeRound(processId, distDir); } else if ("-completeRound".equals(command)) { dat.completeRound(distDir, new File(dbDir, "linkstats.txt")); } else { System.out.println("No directive."); } } finally { System.out.println("Finished at " + new Date(System.currentTimeMillis())); } } }